{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Requirements:\n",
    "    \n",
    "- Python version: Python 3.8.3\n",
    "- numpy version 1.19.2\n",
    "- pandas version 1.1.3\n",
    "\n",
    "        "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Compile Date of Death of Inventors\n",
    "Replication code to produce 'inventor_deathdate.csv'\n",
    "\n",
    "Required files:\n",
    "\n",
    "- f_ssdi_final_data.csv\n",
    "- f_gene_final_data.csv\n",
    "- f_fgi_final_data.csv\n",
    "- a_obi_final_data.csv\n",
    "- a_ssdi_final_data.csv\n",
    "- a_fgi_final_data.csv "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cd \"/our/folder/here/replication_data/death_data/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "f_gene = pd.read_csv(\"f_gene_final_data.csv\")\n",
    "len(f_gene)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#editing the names of columns so it appends easily into one dataframe with the other sources\n",
    "f_gene=f_gene[['inventor_id', 'birth_year', 'app_date_min',\n",
    "       'app_date_max', 'patent_date_min', 'patent_date_max', 'f_gene_birth_date',\n",
    "       'f_gene_death_date', 'f_gene_burial_date', \n",
    "       'f_gene_resi_date', 'f_gene_obi_date',\n",
    "       'f_gene_score', 'f_gene_age_at_death',\n",
    "       'f_gene_death_year', 'f_gene_same_death_year_cnt',\n",
    "       'f_gene_unique_year_cnt', 'f_gene_total_cnt', 'min_death_year',\n",
    "       'max_death_year']].rename(columns = {'f_gene_birth_date': 'birth_date',\n",
    "       'f_gene_death_date':'death_date', 'f_gene_score' :'score', 'f_gene_age_at_death':'age_at_death',\n",
    "       'f_gene_death_year':'death_year', 'f_gene_same_death_year_cnt':'death_year_cnt',\n",
    "       'f_gene_unique_year_cnt':'unique_year_cnt', 'f_gene_total_cnt':'total_cnt','f_gene_resi_date':'resi_date'})\n",
    "f_gene['source']='f_gene'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "f_gene = f_gene[f_gene['death_date'].notnull()]\n",
    "len(f_gene)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#merging all of the datasources together\n",
    "lists = ['a_fgi','a_obi','a_ssdi','f_fgi', 'f_ssdi']\n",
    "\n",
    "all_data = []\n",
    "for i in lists:\n",
    "    data= pd.read_csv(str(i)+\"_final_data.csv\")\n",
    "    print('full data N',len(data))\n",
    "    print(data.columns)\n",
    "    data = data[['inventor_id', 'birth_year','app_date_min',\n",
    "       'app_date_max',str(i)+'_birth_date','patent_date_min', 'patent_date_max',\n",
    "       str(i)+'_death_date', str(i)+'_score', str(i)+'_age_at_death',\n",
    "       str(i)+'_death_year', str(i)+'_same_death_year_cnt',\n",
    "       str(i)+'_unique_year_cnt', str(i)+'_total_cnt', 'min_death_year',\n",
    "       'max_death_year']].rename(columns = {str(i)+'_birth_date': 'birth_date',\n",
    "       str(i)+'_death_date':'death_date', str(i)+'_score' :'score', str(i)+'_age_at_death':'age_at_death',\n",
    "       str(i)+'_death_year':'death_year', str(i)+'_same_death_year_cnt':'death_year_cnt',\n",
    "       str(i)+'_unique_year_cnt':'unique_year_cnt', str(i)+'_total_cnt':'total_cnt'})\n",
    "    data = data[data['death_date'].notnull()]\n",
    "    data['source']= i\n",
    "    print('death not null N', len(data))\n",
    "    all_data.append(data)\n",
    "all_data.append(f_gene)\n",
    "all_death_merged = pd.concat(all_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_death_merged['source'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# There is some error in death year captured, so I am redoing it\n",
    "all_death= all_death_merged.drop('death_year',1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_death['death_date'] = all_death['death_date'].astype(str)\n",
    "all_death['death_year'] = all_death['death_date'].str[-4:]\n",
    "all_death['death_year'] = all_death['death_year'].astype(int)\n",
    "#removing death date errors\n",
    "all_death = all_death[all_death['death_year']<2021]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#removing death date errors\n",
    "all_death = all_death[all_death['death_year']<2021]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# birth year collected from datasource\n",
    "all_death['birth_year']= all_death['birth_date'].str[-4:]\n",
    "all_death['birth_year']=all_death['birth_year'].replace(np.nan,0)\n",
    "all_death['birth_year'] = all_death['birth_year'].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_death.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_death[['inventor_id','source','score','birth_date', 'birth_year','death_date', 'death_year', 'f_gene_burial_date', 'resi_date',\n",
    "           'f_gene_obi_date']].to_csv('inventor_deathdate.csv',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_death['inventor_id'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_death[['inventor_id','source','score','birth_date', 'birth_year','death_date', 'death_year', 'f_gene_burial_date', 'resi_date',\n",
    "           'f_gene_obi_date']].describe().round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_death['inventor_id'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
